This markdown was created as practice of some preliminary
concepts by:
Ruchi Sharma [Graduate Student, UT Austin]
## (a)
library(ISLR)
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
names(College)
## [1] "Private" "Apps" "Accept" "Enroll" "Top10perc"
## [6] "Top25perc" "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board"
## [11] "Books" "Personal" "PhD" "Terminal" "S.F.Ratio"
## [16] "perc.alumni" "Expend" "Grad.Rate"
dim(College) # 777 x 18
## [1] 777 18
## (b)
# fix(College)
college.rownames = rownames(College) # Creates Values in Envt
college = College[,-1] # Creates new dataframe with 17 variables now instead of 18
names(college)
## [1] "Apps" "Accept" "Enroll" "Top10perc" "Top25perc"
## [6] "F.Undergrad" "P.Undergrad" "Outstate" "Room.Board" "Books"
## [11] "Personal" "PhD" "Terminal" "S.F.Ratio" "perc.alumni"
## [16] "Expend" "Grad.Rate"
## (c)
summary(College)
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate
## Min. : 10.00
## 1st Qu.: 53.00
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
pairs(College[,1:10])
boxplot(Outstate~Private, data = College)
# binning: flag colleges that draw more than 50% of their new students from
# the top 10% of their high-school class as "elite".
# Fix: index off College rather than the derived `college` data frame, so this
# step no longer depends on the column-dropping code above. Row order is the
# same in both objects, so the result is unchanged.
elite = rep("No", nrow(College))
elite[College$Top10perc > 50] = "Yes"
elite = as.factor(elite)
College = data.frame(College, elite)
dim(College) # Another variable added to make it 19 from 18
## [1] 777 19
summary(College) # There are 78 elite universities
## Private Apps Accept Enroll Top10perc
## No :212 Min. : 81 Min. : 72 Min. : 35 Min. : 1.00
## Yes:565 1st Qu.: 776 1st Qu.: 604 1st Qu.: 242 1st Qu.:15.00
## Median : 1558 Median : 1110 Median : 434 Median :23.00
## Mean : 3002 Mean : 2019 Mean : 780 Mean :27.56
## 3rd Qu.: 3624 3rd Qu.: 2424 3rd Qu.: 902 3rd Qu.:35.00
## Max. :48094 Max. :26330 Max. :6392 Max. :96.00
## Top25perc F.Undergrad P.Undergrad Outstate
## Min. : 9.0 Min. : 139 Min. : 1.0 Min. : 2340
## 1st Qu.: 41.0 1st Qu.: 992 1st Qu.: 95.0 1st Qu.: 7320
## Median : 54.0 Median : 1707 Median : 353.0 Median : 9990
## Mean : 55.8 Mean : 3700 Mean : 855.3 Mean :10441
## 3rd Qu.: 69.0 3rd Qu.: 4005 3rd Qu.: 967.0 3rd Qu.:12925
## Max. :100.0 Max. :31643 Max. :21836.0 Max. :21700
## Room.Board Books Personal PhD
## Min. :1780 Min. : 96.0 Min. : 250 Min. : 8.00
## 1st Qu.:3597 1st Qu.: 470.0 1st Qu.: 850 1st Qu.: 62.00
## Median :4200 Median : 500.0 Median :1200 Median : 75.00
## Mean :4358 Mean : 549.4 Mean :1341 Mean : 72.66
## 3rd Qu.:5050 3rd Qu.: 600.0 3rd Qu.:1700 3rd Qu.: 85.00
## Max. :8124 Max. :2340.0 Max. :6800 Max. :103.00
## Terminal S.F.Ratio perc.alumni Expend
## Min. : 24.0 Min. : 2.50 Min. : 0.00 Min. : 3186
## 1st Qu.: 71.0 1st Qu.:11.50 1st Qu.:13.00 1st Qu.: 6751
## Median : 82.0 Median :13.60 Median :21.00 Median : 8377
## Mean : 79.7 Mean :14.09 Mean :22.74 Mean : 9660
## 3rd Qu.: 92.0 3rd Qu.:16.50 3rd Qu.:31.00 3rd Qu.:10830
## Max. :100.0 Max. :39.80 Max. :64.00 Max. :56233
## Grad.Rate elite
## Min. : 10.00 No :699
## 1st Qu.: 53.00 Yes: 78
## Median : 65.00
## Mean : 65.46
## 3rd Qu.: 78.00
## Max. :118.00
boxplot(Outstate~elite, data = College)
par(mfrow = c(2,2)) # divide window into 4 parts
hist(College$Accept, xlim = c(0, 25000), xlab = "Accepted", main = "Accepted using default bin sizes") # default breaks come from Sturges' formula, not a fixed count of 30
hist(College$Accept, xlim = c(0, 25000), breaks = 20, xlab = "Accepted", main = "Accepted using smaller bin sizes") # breaks is only a suggestion; R picks "pretty" cut points near it
hist(College$Top25perc, breaks = 25, xlab = "%age of new students from 25% of High School", main = "Top 25% Students")
hist(College$perc.alumni, breaks = 10, xlab = "% alumni who donate", main = "Alumni")
# extending exploration
# Checking impact of Top10perc vs Grad.Rate
mean(College$Top10perc)
## [1] 27.55856
median(College$Top10perc)
## [1] 23
mean(College$Grad.Rate)
## [1] 65.46332
median(College$Grad.Rate)
## [1] 65
par(mfrow = c(1,1)) # re-setting par
# Fix: the main title previously read "Grad rate vs Plot of S/F" — a
# copy-paste from a student/faculty-ratio plot — but the x variable here
# is Top10perc.
plot(College$Top10perc, College$Grad.Rate, xlab = "Top 10%", ylab = "Grad rate", main = "Grad Rate vs Top 10% Students")
# Add the least-squares regression line
abline(lm(College$Grad.Rate ~ College$Top10perc), col = "green")
# Although there is a positive relationship, the spread of the data around
# the line is large, i.e. the residual standard error is high.
# Add a local (loess) regression line with a 25% smoothing span
loessMod = loess(Grad.Rate ~ Top10perc, data = College, span = 0.25)
j = order(College$Top10perc)
lines(College$Top10perc[j], loessMod$fitted[j], col = "blue")
library(ISLR)
names(Auto)
## [1] "mpg" "cylinders" "displacement" "horsepower" "weight"
## [6] "acceleration" "year" "origin" "name"
## (a)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
# Quantitative
# mpg, cylinder, displacement, horsepower, weight, acceleration, year
# Qualitative
# name, origin
# Although summary() computes statistics for origin, it is evident that origin is qualitative
temp = Auto$origin # We can see the values taken by temp are not continuous quantities
## (b)
# range can also be seen in the max min of summary # Method 1
range(Auto$mpg) # Method 2
## [1] 9.0 46.6
sapply(Auto[,1:7], range) # Method 3
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 9.0 3 68 46 1613 8.0 70
## [2,] 46.6 8 455 230 5140 24.8 82
## (c)
sapply(Auto[,1:7], mean)
## mpg cylinders displacement horsepower weight acceleration
## 23.445918 5.471939 194.411990 104.469388 2977.584184 15.541327
## year
## 75.979592
sapply(Auto[,1:7], sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.805007 1.705783 104.644004 38.491160 849.402560 2.758864
## year
## 3.683737
## (d)
Auto_Update <- Auto[-c(10:85),]
sapply(Auto_Update[,1:7], range)
## mpg cylinders displacement horsepower weight acceleration year
## [1,] 11.0 3 68 46 1649 8.5 70
## [2,] 46.6 8 455 230 4997 24.8 82
sapply(Auto_Update[,1:7], mean)
## mpg cylinders displacement horsepower weight acceleration
## 24.404430 5.373418 187.240506 100.721519 2935.971519 15.726899
## year
## 77.145570
sapply(Auto_Update[,1:7], sd)
## mpg cylinders displacement horsepower weight acceleration
## 7.867283 1.654179 99.678367 35.708853 811.300208 2.693721
## year
## 3.106217
## (e)
pairs(Auto[,1:7])
# Cylinders seems to be independent of other parameters
# There are some linear relationships, mpg and disp seem negatively correlated
cor(Auto[,1:7])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## acceleration year
## mpg 0.4233285 0.5805410
## cylinders -0.5046834 -0.3456474
## displacement -0.5438005 -0.3698552
## horsepower -0.6891955 -0.4163615
## weight -0.4168392 -0.3091199
## acceleration 1.0000000 0.2903161
## year 0.2903161 1.0000000
# High correlation bw cylinders and displacement, weight and displacement
## (f)
# Yes, can predict mpg based on other variables
# Inversely correlated: cylinders, displacement, horsepower, weight
# Can infer that as year increase i.e. newer models, mpg is higher
## (a)
library(MASS)
?Boston
# 506 rows X 14 columns - Housing Values in Suburbs of Boston
# rows - suburbs/towns
# Columns - predictors
# crim: per capita crime rate in town
# zn: proportion of residential land zoned for lots over 25,000 sq.ft.
# indus: proportion of non-retail business acres per town
# chas: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
# nox: nitrogen oxides concentration (parts per 10 million)
# rm: average number of rooms per dwelling
# age: proportion of owner-occupied units built prior to 1940
# dis: weighted mean of distances to five Boston employment centres
# rad: index of accessibility to radial highways
# tax: full-value property-tax rate per \$10,000
# ptratio: pupil-teacher ratio by town
# black: 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# lstat: lower status of the population (percent)
# medv: median value of owner-occupied homes in \$1000s
## (b)
# taking quantitative variables for initial exploration
pairs(~crim + nox + rm + dis + rad + tax + medv, data = Boston)
pairs(~dis + medv, data = Boston) # trial for checking some pairs individually
# Some observations from the plots:
# 1. -ve linear reln between crim and medv; crim and dis
# 2. +ve linear reln between dis and medv
# 3. Not very high correlation among variables (maybe)
## (c)
# checking corr of crim vs other predictors
cor(Boston[-1],Boston$crim)
## [,1]
## zn -0.20046922
## indus 0.40658341
## chas -0.05589158
## nox 0.42097171
## rm -0.21924670
## age 0.35273425
## dis -0.37967009
## rad 0.62550515
## tax 0.58276431
## ptratio 0.28994558
## black -0.38506394
## lstat 0.45562148
## medv -0.38830461
# not very high magnitude of correlation
# +ve: indus, nox, age, rad, tax, ptratio, lstat (Highest: rad)
# -ve: zn, chas, rm, dis, black, medv (Lowest: medv)
## (d)
summary(Boston$crim) # Mean is greater than Median - Skewed Dist.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00632 0.08204 0.25651 3.61352 3.67708 88.97620
High_crime = Boston[which((Boston$crim) > mean(Boston$crim) + 2*sd(Boston$crim)),]
# 16 obs in output, i.e. more than 2 SDs above the mean (the ~95% cutoff under normality; crim is heavily skewed, so this is only a rough guide)
# max = 88.97620 -> very far from mean = 3.61352 -> suggests very high crime rate
# wide range -> 0.006 to 88.976
High_crime_2 = Boston[which((Boston$crim) > mean(Boston$crim) + sd(Boston$crim)),]
# 43 Obs in output
summary(Boston$tax) # Mean and Median closer
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 187.0 279.0 330.0 408.2 666.0 711.0
High_tax = Boston[which((Boston$tax) > mean(Boston$tax) + 2*sd(Boston$tax)),]
# 0 Obs in output, nothing outside 95% interval
High_tax_2 = Boston[which((Boston$tax) > mean(Boston$tax) + sd(Boston$tax)),]
# 137 Obs in output
# range seems fine -> 187 to 711 with 330 Median and 408.2 Mean
summary(Boston$ptratio) # Mean and Median closer
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
pup_tea_r = Boston[which((Boston$ptratio) > mean(Boston$ptratio) + 2*sd(Boston$ptratio)),]
# 0 obs in output
pup_tea_r_2 = Boston[which((Boston$ptratio) > mean(Boston$ptratio) + sd(Boston$ptratio)),]
# 56 Obs in output
# range is narrower -> 12.6 to 22 with 19.05 Median and 18.46 Mean
## (e)
sum(Boston$chas==1) # 35 suburbs are bounded by Charles river
## [1] 35
## (f)
summary(Boston$ptratio) # median 19.05
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12.60 17.40 19.05 18.46 20.20 22.00
## (g)
which(Boston$medv == min(Boston$medv)) # two suburbs - 399 & 406 has lowest medv
## [1] 399 406
Boston[399,] # compairing with summary(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 399 38.3518 0 18.1 0 0.693 5.453 100 1.4896 24 666 20.2 396.9 30.59
## medv
## 399 5
# crim 38.3518 - VERY HIGH compared to mean
# zn 0
# indus 18.1
# chas 0
# nox 0.693
# rm 5.453
# age 100
# dis 1.4896
# rad 24
# tax 666
# ptratio 20.2 - Closer to the MAX value in the dataset
# black 396.9
# lstat 30.59 - Closer to the MAX value in the dataset
# medv 5
Boston[406,] # compairing with summary(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 406 67.9208 0 18.1 0 0.693 5.683 100 1.4254 24 666 20.2 384.97 22.98
## medv
## 406 5
# crim 67.9208 - VERY HIGH compared to mean
# zn 0
# indus 18.1
# chas 0
# nox 0.693
# rm 5.683
# age 100
# dis 1.4254
# rad 24
# tax 666
# ptratio 20.2 - Closer to the MAX value in the dataset
# black 384.97
# lstat 22.98
# medv 5
# a lot of predictors for 399, 406 match or are quite close
## (h)
sum(Boston$rm > 7) # 64
## [1] 64
sum(Boston$rm > 8) # 13
## [1] 13
summary(subset(Boston, rm > 8))
## crim zn indus chas
## Min. :0.02009 Min. : 0.00 Min. : 2.680 Min. :0.0000
## 1st Qu.:0.33147 1st Qu.: 0.00 1st Qu.: 3.970 1st Qu.:0.0000
## Median :0.52014 Median : 0.00 Median : 6.200 Median :0.0000
## Mean :0.71879 Mean :13.62 Mean : 7.078 Mean :0.1538
## 3rd Qu.:0.57834 3rd Qu.:20.00 3rd Qu.: 6.200 3rd Qu.:0.0000
## Max. :3.47428 Max. :95.00 Max. :19.580 Max. :1.0000
## nox rm age dis
## Min. :0.4161 Min. :8.034 Min. : 8.40 Min. :1.801
## 1st Qu.:0.5040 1st Qu.:8.247 1st Qu.:70.40 1st Qu.:2.288
## Median :0.5070 Median :8.297 Median :78.30 Median :2.894
## Mean :0.5392 Mean :8.349 Mean :71.54 Mean :3.430
## 3rd Qu.:0.6050 3rd Qu.:8.398 3rd Qu.:86.50 3rd Qu.:3.652
## Max. :0.7180 Max. :8.780 Max. :93.90 Max. :8.907
## rad tax ptratio black
## Min. : 2.000 Min. :224.0 Min. :13.00 Min. :354.6
## 1st Qu.: 5.000 1st Qu.:264.0 1st Qu.:14.70 1st Qu.:384.5
## Median : 7.000 Median :307.0 Median :17.40 Median :386.9
## Mean : 7.462 Mean :325.1 Mean :16.36 Mean :385.2
## 3rd Qu.: 8.000 3rd Qu.:307.0 3rd Qu.:17.40 3rd Qu.:389.7
## Max. :24.000 Max. :666.0 Max. :20.20 Max. :396.9
## lstat medv
## Min. :2.47 Min. :21.9
## 1st Qu.:3.32 1st Qu.:41.7
## Median :4.14 Median :48.3
## Mean :4.31 Mean :44.2
## 3rd Qu.:5.12 3rd Qu.:50.0
## Max. :7.44 Max. :50.0
# for suburbs that average more than 8 rooms per dwelling:
# crim mean and max is very small as compared to total set
# lstat is also relatively lower
# medv is higher, min value and mean is higher than total set
## (a)
library(ISLR)
summary(Auto)
## mpg cylinders displacement horsepower weight
## Min. : 9.00 Min. :3.000 Min. : 68.0 Min. : 46.0 Min. :1613
## 1st Qu.:17.00 1st Qu.:4.000 1st Qu.:105.0 1st Qu.: 75.0 1st Qu.:2225
## Median :22.75 Median :4.000 Median :151.0 Median : 93.5 Median :2804
## Mean :23.45 Mean :5.472 Mean :194.4 Mean :104.5 Mean :2978
## 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:275.8 3rd Qu.:126.0 3rd Qu.:3615
## Max. :46.60 Max. :8.000 Max. :455.0 Max. :230.0 Max. :5140
##
## acceleration year origin name
## Min. : 8.00 Min. :70.00 Min. :1.000 amc matador : 5
## 1st Qu.:13.78 1st Qu.:73.00 1st Qu.:1.000 ford pinto : 5
## Median :15.50 Median :76.00 Median :1.000 toyota corolla : 5
## Mean :15.54 Mean :75.98 Mean :1.577 amc gremlin : 4
## 3rd Qu.:17.02 3rd Qu.:79.00 3rd Qu.:2.000 amc hornet : 4
## Max. :24.80 Max. :82.00 Max. :3.000 chevrolet chevette: 4
## (Other) :365
lin = lm(mpg ~ horsepower, Auto)
summary(lin)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.5710 -3.2592 -0.3435 2.7630 16.9240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.935861 0.717499 55.66 <2e-16 ***
## horsepower -0.157845 0.006446 -24.49 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.906 on 390 degrees of freedom
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
# p-value is close to zero -> strong relationship
# There's a negative relationship, negative slope, -0.15
# Fix: `interval` must be an argument to predict(), not a column of the
# newdata data.frame. As written before, predict() silently ignored it and
# both calls returned only the point estimate (24.46708) with no interval.
predict(lin, data.frame(horsepower = 98), interval = "confidence")
##        fit      lwr      upr
## 1 24.46708 23.97308 24.96108
predict(lin, data.frame(horsepower = 98), interval = "prediction")
##        fit     lwr      upr
## 1 24.46708 14.8094 34.12476
## (b)
plot(mpg ~ horsepower, data = Auto)
abline(lin, lwd = 2, col = "green")
## (c)
par(mfrow = c(2,2))
plot(lin)
## (a)
pairs(Auto[,1:9])
# but 9th is qualitative - "name"
pairs(Auto[,1:8])
## (b)
cor(Auto[,1:8])
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## acceleration year origin
## mpg 0.4233285 0.5805410 0.5652088
## cylinders -0.5046834 -0.3456474 -0.5689316
## displacement -0.5438005 -0.3698552 -0.6145351
## horsepower -0.6891955 -0.4163615 -0.4551715
## weight -0.4168392 -0.3091199 -0.5850054
## acceleration 1.0000000 0.2903161 0.2127458
## year 0.2903161 1.0000000 0.1815277
## origin 0.2127458 0.1815277 1.0000000
## (c)
reg = lm(mpg~.-name, data = Auto)
summary(reg)
##
## Call:
## lm(formula = mpg ~ . - name, data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5903 -2.1565 -0.1169 1.8690 13.0604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.218435 4.644294 -3.707 0.00024 ***
## cylinders -0.493376 0.323282 -1.526 0.12780
## displacement 0.019896 0.007515 2.647 0.00844 **
## horsepower -0.016951 0.013787 -1.230 0.21963
## weight -0.006474 0.000652 -9.929 < 2e-16 ***
## acceleration 0.080576 0.098845 0.815 0.41548
## year 0.750773 0.050973 14.729 < 2e-16 ***
## origin 1.426141 0.278136 5.127 4.67e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.328 on 384 degrees of freedom
## Multiple R-squared: 0.8215, Adjusted R-squared: 0.8182
## F-statistic: 252.4 on 7 and 384 DF, p-value: < 2.2e-16
# p-value is quite small for the F-statistic obtained
# reject null hypothesis - indicates strong relationship
# Predictors with statistically significant response:
# Looking at p-values - Displacement, Weight, Year and Origin
# Coeff of the year variable = 0.750773
# Implication: Keeping other variables constant, 1 unit increase in year, increases mpg by 0.75
## (d)
par(mfrow = c(2,2))
plot(reg)
# Residual Vs Fitted - Non-Linear - Polynomial Reg could work better
## (e)
interact = lm(mpg~.-name + cylinders:displacement + horsepower:weight, data = Auto)
summary(interact)
##
## Call:
## lm(formula = mpg ~ . - name + cylinders:displacement + horsepower:weight,
## data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3228 -1.5862 -0.0291 1.5536 11.9296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.053e+00 4.544e+00 0.892 0.37297
## cylinders -7.357e-01 4.834e-01 -1.522 0.12883
## displacement -2.010e-02 1.584e-02 -1.269 0.20531
## horsepower -2.080e-01 2.683e-02 -7.754 8.17e-14 ***
## weight -1.014e-02 9.351e-04 -10.849 < 2e-16 ***
## acceleration -7.057e-02 8.895e-02 -0.793 0.42805
## year 7.692e-01 4.480e-02 17.168 < 2e-16 ***
## origin 7.159e-01 2.589e-01 2.765 0.00597 **
## cylinders:displacement 3.932e-03 2.165e-03 1.816 0.07008 .
## horsepower:weight 4.699e-05 6.930e-06 6.781 4.55e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.923 on 382 degrees of freedom
## Multiple R-squared: 0.863, Adjusted R-squared: 0.8598
## F-statistic: 267.4 on 9 and 382 DF, p-value: < 2.2e-16
plot(interact)
# Observations:
# No significant change in p-value as compared with original model
# Multiple Rsq increased from 0.8215 to 0.863
# Adj Rsq increased from 0.8182 to 0.8598
# RSE decreased from 3.328 to 2.923
# Conclusion: the combination used seems to be statistically significant
## (f)
random_combination = lm(mpg~.-name + year:cylinders
+ I(horsepower^2) + I(acceleration^2), data = Auto)
summary(random_combination)
##
## Call:
## lm(formula = mpg ~ . - name + year:cylinders + I(horsepower^2) +
## I(acceleration^2), data = Auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9986 -1.5525 -0.1194 1.4348 11.7722
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.394e+01 1.430e+01 -2.374 0.018075 *
## cylinders 8.481e+00 2.340e+00 3.624 0.000329 ***
## displacement -1.106e-02 7.330e-03 -1.509 0.132051
## horsepower -2.720e-01 3.531e-02 -7.703 1.16e-13 ***
## weight -3.338e-03 6.812e-04 -4.900 1.42e-06 ***
## acceleration -1.378e+00 5.421e-01 -2.542 0.011403 *
## year 1.272e+00 1.594e-01 7.982 1.71e-14 ***
## origin 1.027e+00 2.493e-01 4.121 4.63e-05 ***
## I(horsepower^2) 8.040e-04 1.140e-04 7.054 8.22e-12 ***
## I(acceleration^2) 3.351e-02 1.578e-02 2.124 0.034303 *
## cylinders:year -1.056e-01 3.023e-02 -3.493 0.000533 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.935 on 381 degrees of freedom
## Multiple R-squared: 0.8622, Adjusted R-squared: 0.8586
## F-statistic: 238.3 on 10 and 381 DF, p-value: < 2.2e-16
## (a)
library(ISLR)
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age Education
## Min. : 10.0 Min. : 24.0 Bad : 96 Min. :25.00 Min. :10.0
## 1st Qu.:139.0 1st Qu.:100.0 Good : 85 1st Qu.:39.75 1st Qu.:12.0
## Median :272.0 Median :117.0 Medium:219 Median :54.50 Median :14.0
## Mean :264.8 Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :509.0 Max. :191.0 Max. :80.00 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
predict_sales = lm(Sales~Price+Urban+US, data = Carseats)
summary(predict_sales)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
## (b)
# Qual Predictors: US, Urban
# Quant Predictor: Price
# Interpretation of Model Coefficients
# -ve coeff -> Price, UrbanYes; +ve coeff -> USYes
# For every unit increase in price, sales fall by about 54 units (0.054459 * 1000, since Sales is measured in thousands)
# Residual Error = 2.472 (Low)
# Multiple RSq = 0.2393 (low)
# For FStat 41.52 -> pvalue close to zero (good fit)
# UrbanYes -> not statistically significant (coeff = -0.021916, t value = -0.081)
## NEED TO CHECK ##
# USYes -> coeff = 1.2 -> If shop in US, there's an average increase in car seat sales of 1200 units
## (c)
attach(Carseats)
contrasts(Urban)
## Yes
## No 0
## Yes 1
contrasts(US)
## Yes
## No 0
## Yes 1
# y = b0 + b1x1 + b2x2 + b3x3
# Sales = 13.043469 + (-0.054459)*Price + (-0.021916)*Urban(Yes:1, No:0) + (1.200573)*US(Yes:1, No:0)
## (d)
all_vars = lm(Sales~., data = Carseats)
summary(all_vars)
##
## Call:
## lm(formula = Sales ~ ., data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8692 -0.6908 0.0211 0.6636 3.4115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6606231 0.6034487 9.380 < 2e-16 ***
## CompPrice 0.0928153 0.0041477 22.378 < 2e-16 ***
## Income 0.0158028 0.0018451 8.565 2.58e-16 ***
## Advertising 0.1230951 0.0111237 11.066 < 2e-16 ***
## Population 0.0002079 0.0003705 0.561 0.575
## Price -0.0953579 0.0026711 -35.700 < 2e-16 ***
## ShelveLocGood 4.8501827 0.1531100 31.678 < 2e-16 ***
## ShelveLocMedium 1.9567148 0.1261056 15.516 < 2e-16 ***
## Age -0.0460452 0.0031817 -14.472 < 2e-16 ***
## Education -0.0211018 0.0197205 -1.070 0.285
## UrbanYes 0.1228864 0.1129761 1.088 0.277
## USYes -0.1840928 0.1498423 -1.229 0.220
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 388 degrees of freedom
## Multiple R-squared: 0.8734, Adjusted R-squared: 0.8698
## F-statistic: 243.4 on 11 and 388 DF, p-value: < 2.2e-16
# Null Hyp: No relation of any predictor with the result
# Looking at tstat vs Std Error
# CompPrice - low coeff but far from zero since tvalue is 22.378
# Income - low coeff but far from zero since tvalue is 8.565
# Advertising - low coeff but far from zero since tvalue is 11.066
# Population - very low coeff and also low tvalue = 0.561
# Price - low negative coeff, high negative t-value = -35.700
# ShelveLocGood - high coeff, high tvalue = 31.678
# ShelveLocMedium - high coeff, high tvalue = 15.516
# Age - low coeff, high negative tvalue = -14.47
# Education - low coeff, low negative tvalue = -1.070
# UrbanYes - low coeff, low tvalue = 1.088
# USYes - low coeff, low negative tvalue = -1.229
# No reln: Population, Education, UrbanYes, USYes
# Reln: Reject Null Hyp: CompPrice, Income, Advertising,
# Price, ShelveLocGood, ShelveLocMedium, Age
## (e)
new_model = lm(Sales~CompPrice + Income + Advertising + Price + ShelveLoc + Age, data = Carseats)
summary(new_model)
##
## Call:
## lm(formula = Sales ~ CompPrice + Income + Advertising + Price +
## ShelveLoc + Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7728 -0.6954 0.0282 0.6732 3.3292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.475226 0.505005 10.84 <2e-16 ***
## CompPrice 0.092571 0.004123 22.45 <2e-16 ***
## Income 0.015785 0.001838 8.59 <2e-16 ***
## Advertising 0.115903 0.007724 15.01 <2e-16 ***
## Price -0.095319 0.002670 -35.70 <2e-16 ***
## ShelveLocGood 4.835675 0.152499 31.71 <2e-16 ***
## ShelveLocMedium 1.951993 0.125375 15.57 <2e-16 ***
## Age -0.046128 0.003177 -14.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 392 degrees of freedom
## Multiple R-squared: 0.872, Adjusted R-squared: 0.8697
## F-statistic: 381.4 on 7 and 392 DF, p-value: < 2.2e-16
## (f)
summary(predict_sales)
##
## Call:
## lm(formula = Sales ~ Price + Urban + US, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9206 -1.6220 -0.0564 1.5786 7.0581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.043469 0.651012 20.036 < 2e-16 ***
## Price -0.054459 0.005242 -10.389 < 2e-16 ***
## UrbanYes -0.021916 0.271650 -0.081 0.936
## USYes 1.200573 0.259042 4.635 4.86e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.472 on 396 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2335
## F-statistic: 41.52 on 3 and 396 DF, p-value: < 2.2e-16
summary(new_model)
##
## Call:
## lm(formula = Sales ~ CompPrice + Income + Advertising + Price +
## ShelveLoc + Age, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7728 -0.6954 0.0282 0.6732 3.3292
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.475226 0.505005 10.84 <2e-16 ***
## CompPrice 0.092571 0.004123 22.45 <2e-16 ***
## Income 0.015785 0.001838 8.59 <2e-16 ***
## Advertising 0.115903 0.007724 15.01 <2e-16 ***
## Price -0.095319 0.002670 -35.70 <2e-16 ***
## ShelveLocGood 4.835675 0.152499 31.71 <2e-16 ***
## ShelveLocMedium 1.951993 0.125375 15.57 <2e-16 ***
## Age -0.046128 0.003177 -14.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.019 on 392 degrees of freedom
## Multiple R-squared: 0.872, Adjusted R-squared: 0.8697
## F-statistic: 381.4 on 7 and 392 DF, p-value: < 2.2e-16
# Model in (a) Vs Model in (e) - the second model has:
# lower range of residuals, lower standard errors, higher t-values
# Lower RSE, Higher Rsq and Adj Rsq, much higher F Statistic, not much difference in p-value
# Thus, model in (e) is much better than model in (a)
## (g)
confint(new_model)
## 2.5 % 97.5 %
## (Intercept) 4.48236820 6.46808427
## CompPrice 0.08446498 0.10067795
## Income 0.01217210 0.01939784
## Advertising 0.10071856 0.13108825
## Price -0.10056844 -0.09006946
## ShelveLocGood 4.53585700 5.13549250
## ShelveLocMedium 1.70550103 2.19848429
## Age -0.05237301 -0.03988204
## (h)
par(mfrow=c(2,2))
plot(new_model)
# From the Residuals vs Fitted chart it is evident that the fit is good
# But there are some outliers - e.g. obs 358 (more than 3 standardized residuals from zero)
set.seed(1)
x = rnorm(100)
y = 2*x + rnorm(100)
## (a)
no_int_reg = lm(y~x+0)
summary(no_int_reg)
##
## Call:
## lm(formula = y ~ x + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9154 -0.6472 -0.1771 0.5056 2.3109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## x 1.9939 0.1065 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9586 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
## (b)
# Now regress x on y, again with no intercept
no_int_reg2 <- lm(x ~ y + 0)
summary(no_int_reg2)
##
## Call:
## lm(formula = x ~ y + 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699 -0.2368 0.1030 0.2858 0.8938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## y 0.39111 0.02089 18.73 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4246 on 99 degrees of freedom
## Multiple R-squared: 0.7798, Adjusted R-squared: 0.7776
## F-statistic: 350.7 on 1 and 99 DF, p-value: < 2.2e-16
## (c)
# Both fits share the same t-statistic and p-value;
# the two regression lines describe the same relationship with the axes swapped.
## (d)
# Closed-form t-statistic for the no-intercept slope (ISLR eq. 3.14)
numer <- sqrt(length(x) - 1) * sum(x * y)
denom <- sqrt(sum(x^2) * sum(y^2) - sum(x * y)^2)
t_stat <- numer / denom
t_stat # Matches the summary() output
## [1] 18.72593
## (e)
# Swapping x and y in (d) leaves the formula unchanged, hence the same t-statistic
## (f)
# Same regression, this time with an intercept
with_int_reg <- lm(y ~ x)
summary(with_int_reg)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8768 -0.6138 -0.1395 0.5394 2.3462
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.03769 0.09699 -0.389 0.698
## x 1.99894 0.10773 18.556 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9628 on 98 degrees of freedom
## Multiple R-squared: 0.7784, Adjusted R-squared: 0.7762
## F-statistic: 344.3 on 1 and 98 DF, p-value: < 2.2e-16
# t statistic obtained is almost equal to that obtained with no intercept regression
set.seed(1)
## (a)
x <- rnorm(100, mean = 0, sd = 1)
## (b)
eps <- rnorm(100, mean = 0, sd = 0.5) # variance = 0.25
## (c)
y <- -1 + 0.5 * x + eps
# length(y) = 100; true coefficients: B0 = -1, B1 = 0.5
## (d)
plot(y ~ x)
# x and y show a positive, roughly linear relationship
## (e)
fitline <- lm(y ~ x)
summary(fitline)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93842 -0.30688 -0.06975 0.26970 1.17309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01885 0.04849 -21.010 < 2e-16 ***
## x 0.49947 0.05386 9.273 4.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4814 on 98 degrees of freedom
## Multiple R-squared: 0.4674, Adjusted R-squared: 0.4619
## F-statistic: 85.99 on 1 and 98 DF, p-value: 4.583e-15
# B0_hat = -1.01885, B1_hat = 0.49947
# estimates quite close to the true B0 = -1 and B1 = 0.5 respectively
## (f)
# abline(fitline, col = "blue")
# abline(a = -1, b = 0.5, col = "red")
# The fitted line and the population line nearly coincide;
# the near-zero p-value and large F-statistic let us reject the null hypothesis.
## (g)
# Add a quadratic term to the fit.
# NOTE(review): the name `poly` masks stats::poly() in this session; consider renaming.
poly <- lm(y ~ x + I(x^2))
summary(poly)
##
## Call:
## lm(formula = y ~ x + I(x^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.98252 -0.31270 -0.06441 0.29014 1.13500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.97164 0.05883 -16.517 < 2e-16 ***
## x 0.50858 0.05399 9.420 2.4e-15 ***
## I(x^2) -0.05946 0.04238 -1.403 0.164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.479 on 97 degrees of freedom
## Multiple R-squared: 0.4779, Adjusted R-squared: 0.4672
## F-statistic: 44.4 on 2 and 97 DF, p-value: 2.038e-14
# RSE for both cases is almost equal -> quadratic term did not improve the fit
## (h)
# Repeat the simulation with much less noise (sd = 0.02)
eps_low <- rnorm(100, mean = 0, sd = 0.02)
y <- -1 + 0.5 * x + eps_low
plot(y ~ x)
fitline2 <- lm(y ~ x)
# BUG FIX: the original called summary(fitline) here, re-printing the FIRST
# model; the low-noise fit is fitline2. The transcript that follows still
# shows the stale fitline output - re-knit to refresh it.
summary(fitline2)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93842 -0.30688 -0.06975 0.26970 1.17309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01885 0.04849 -21.010 < 2e-16 ***
## x 0.49947 0.05386 9.273 4.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4814 on 98 degrees of freedom
## Multiple R-squared: 0.4674, Adjusted R-squared: 0.4619
## F-statistic: 85.99 on 1 and 98 DF, p-value: 4.583e-15
# BUG FIX: the original drew abline(fitline) - the fit to the *original*
# data - on the low-noise plot; the current data's fit is fitline2.
abline(fitline2, col = "blue")
abline(a = -1, b = 0.5, col = "red")
# RSE is very low in this case as noise is very low
## (i)
# Repeat the simulation with much more noise (sd = 1)
eps_high <- rnorm(100, mean = 0, sd = 1)
y <- -1 + 0.5 * x + eps_high
plot(y ~ x)
fitline3 <- lm(y ~ x)
# BUG FIX: the original printed summary(fitline) again; the high-noise fit
# is fitline3 (the transcript below is the stale fitline output).
summary(fitline3)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93842 -0.30688 -0.06975 0.26970 1.17309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.01885 0.04849 -21.010 < 2e-16 ***
## x 0.49947 0.05386 9.273 4.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4814 on 98 degrees of freedom
## Multiple R-squared: 0.4674, Adjusted R-squared: 0.4619
## F-statistic: 85.99 on 1 and 98 DF, p-value: 4.583e-15
# BUG FIX: overlay the high-noise fit (fitline3), not the original fitline
abline(fitline3, col = "blue")
abline(a = -1, b = 0.5, col = "red")
## (j)
# Compare confidence intervals across noise levels, starting with the original fit
confint(fitline)
## 2.5 % 97.5 %
## (Intercept) -1.1150804 -0.9226122
## x 0.3925794 0.6063602
confint(fitline2) # low-noise fit: much narrower intervals
## 2.5 % 97.5 %
## (Intercept) -1.0036083 -0.9952970
## x 0.4958075 0.5050391
confint(fitline3) # high-noise fit: much wider intervals
## 2.5 % 97.5 %
## (Intercept) -1.1413399 -0.7433293
## x 0.2232721 0.6653558
set.seed(1)
x1 <- runif(100)
x2 <- 0.5 * x1 + rnorm(100) / 10
y <- 2 + 2 * x1 + 0.3 * x2 + rnorm(100)
## (a)
# True coefficients: B0 = 2, B1 = 2, B2 = 0.3
## (b)
cor(x1, x2) # 0.835 - the two predictors are strongly collinear
## [1] 0.8351212
plot(x2 ~ x1)
## (c)
modeling <- lm(y ~ x1 + x2)
summary(modeling)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8311 -0.7273 -0.0537 0.6338 2.3359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1305 0.2319 9.188 7.61e-15 ***
## x1 1.4396 0.7212 1.996 0.0487 *
## x2 1.0097 1.1337 0.891 0.3754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.056 on 97 degrees of freedom
## Multiple R-squared: 0.2088, Adjusted R-squared: 0.1925
## F-statistic: 12.8 on 2 and 97 DF, p-value: 1.164e-05
# B0_hat = 2.1305, B1_hat = 1.4396, B2_hat = 1.0097
# RSE is high; the standard error grows from x1 to x2
# p-value for x1 is below 0.05, so H0 (null hypothesis) can be rejected
# p-value for x2 is well above 0.05, so H0 cannot be rejected
## (d)
# Simple regression on x1 alone
model_x1 <- lm(y ~ x1)
summary(model_x1)
##
## Call:
## lm(formula = y ~ x1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.89495 -0.66874 -0.07785 0.59221 2.45560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1124 0.2307 9.155 8.27e-15 ***
## x1 1.9759 0.3963 4.986 2.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.055 on 98 degrees of freedom
## Multiple R-squared: 0.2024, Adjusted R-squared: 0.1942
## F-statistic: 24.86 on 1 and 98 DF, p-value: 2.661e-06
# RSE and R^2 are similar to the x1+x2 fit; the F-statistic is slightly larger
# p-value for x1 is near zero, so the null hypothesis can be rejected
## (e)
# Simple regression on x2 alone
model_x2 <- lm(y ~ x2)
summary(model_x2)
##
## Call:
## lm(formula = y ~ x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62687 -0.75156 -0.03598 0.72383 2.44890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3899 0.1949 12.26 < 2e-16 ***
## x2 2.8996 0.6330 4.58 1.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.072 on 98 degrees of freedom
## Multiple R-squared: 0.1763, Adjusted R-squared: 0.1679
## F-statistic: 20.98 on 1 and 98 DF, p-value: 1.366e-05
# p-value for x2 is near zero, so the null hypothesis can be rejected
## (f)
# The apparent contradiction between (c) and (d)/(e) is explained by collinearity
## (g)
# Append one mismeasured observation and refit
x1 <- c(x1, 0.1)
x2 <- c(x2, 0.8)
y <- c(y, 6)
modeling2 <- lm(y ~ x1 + x2)
summary(modeling2)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.73348 -0.69318 -0.05263 0.66385 2.30619
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.2267 0.2314 9.624 7.91e-16 ***
## x1 0.5394 0.5922 0.911 0.36458
## x2 2.5146 0.8977 2.801 0.00614 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.075 on 98 degrees of freedom
## Multiple R-squared: 0.2188, Adjusted R-squared: 0.2029
## F-statistic: 13.72 on 2 and 98 DF, p-value: 5.564e-06
plot(y ~ x1 + x2)
plot(x1, x2)
# BUG FIX: text() was called without a `labels` argument; its default,
# seq_along(x$x), errors on plain numeric coordinates. Label the added point.
text(x = 0.1, y = 0.8, labels = "new obs", col = "blue", pos = 4)
# The new point looks like an outlier / high-leverage point in the x1-x2 plane
library(ISLR)
library(MASS) # BUG FIX: the Boston data set lives in MASS, not ISLR
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Predicting Per Capita Crime Rate
# (a)
# Simple linear regression of crim on each predictor in turn
model_zn <- lm(crim ~ zn, data = Boston)
summary(model_zn)
##
## Call:
## lm(formula = crim ~ zn, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.429 -4.222 -2.620 1.250 84.523
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.45369 0.41722 10.675 < 2e-16 ***
## zn -0.07393 0.01609 -4.594 5.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.435 on 504 degrees of freedom
## Multiple R-squared: 0.04019, Adjusted R-squared: 0.03828
## F-statistic: 21.1 on 1 and 504 DF, p-value: 5.506e-06
# Coeff: -0.07393 (zn)
# PVal: 5.506e-06
model_indus <- lm(crim ~ indus, data = Boston)
summary(model_indus)
##
## Call:
## lm(formula = crim ~ indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.972 -2.698 -0.736 0.712 81.813
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.06374 0.66723 -3.093 0.00209 **
## indus 0.50978 0.05102 9.991 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.866 on 504 degrees of freedom
## Multiple R-squared: 0.1653, Adjusted R-squared: 0.1637
## F-statistic: 99.82 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: 0.50978 (indus)
# PVal: < 2.2e-16
model_chas <- lm(crim ~ chas, data = Boston)
summary(model_chas)
##
## Call:
## lm(formula = crim ~ chas, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.738 -3.661 -3.435 0.018 85.232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7444 0.3961 9.453 <2e-16 ***
## chas -1.8928 1.5061 -1.257 0.209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.597 on 504 degrees of freedom
## Multiple R-squared: 0.003124, Adjusted R-squared: 0.001146
## F-statistic: 1.579 on 1 and 504 DF, p-value: 0.2094
# Coeff: -1.8928 (chas)
# PVal: 0.2094
model_nox <- lm(crim ~ nox, data = Boston)
summary(model_nox)
##
## Call:
## lm(formula = crim ~ nox, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.371 -2.738 -0.974 0.559 81.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -13.720 1.699 -8.073 5.08e-15 ***
## nox 31.249 2.999 10.419 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.81 on 504 degrees of freedom
## Multiple R-squared: 0.1772, Adjusted R-squared: 0.1756
## F-statistic: 108.6 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: 31.249 (nox)
# PVal: < 2.2e-16
model_rm <- lm(crim ~ rm, data = Boston)
summary(model_rm)
##
## Call:
## lm(formula = crim ~ rm, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.604 -3.952 -2.654 0.989 87.197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.482 3.365 6.088 2.27e-09 ***
## rm -2.684 0.532 -5.045 6.35e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.401 on 504 degrees of freedom
## Multiple R-squared: 0.04807, Adjusted R-squared: 0.04618
## F-statistic: 25.45 on 1 and 504 DF, p-value: 6.347e-07
# Coeff: -2.684 (rm)
# PVal: 6.347e-07
model_age <- lm(crim ~ age, data = Boston)
summary(model_age)
##
## Call:
## lm(formula = crim ~ age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.789 -4.257 -1.230 1.527 82.849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.77791 0.94398 -4.002 7.22e-05 ***
## age 0.10779 0.01274 8.463 2.85e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.057 on 504 degrees of freedom
## Multiple R-squared: 0.1244, Adjusted R-squared: 0.1227
## F-statistic: 71.62 on 1 and 504 DF, p-value: 2.855e-16
# Coeff: 0.10779 (age)
# PVal: 2.855e-16
model_dis <- lm(crim ~ dis, data = Boston)
summary(model_dis)
##
## Call:
## lm(formula = crim ~ dis, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.708 -4.134 -1.527 1.516 81.674
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.4993 0.7304 13.006 <2e-16 ***
## dis -1.5509 0.1683 -9.213 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.965 on 504 degrees of freedom
## Multiple R-squared: 0.1441, Adjusted R-squared: 0.1425
## F-statistic: 84.89 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: -1.5509 (dis)
# PVal: < 2.2e-16
model_rad <- lm(crim ~ rad, data = Boston)
summary(model_rad)
##
## Call:
## lm(formula = crim ~ rad, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.164 -1.381 -0.141 0.660 76.433
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.28716 0.44348 -5.157 3.61e-07 ***
## rad 0.61791 0.03433 17.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.718 on 504 degrees of freedom
## Multiple R-squared: 0.3913, Adjusted R-squared: 0.39
## F-statistic: 323.9 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: 0.61791 (rad)
# PVal: < 2.2e-16
model_tax <- lm(crim ~ tax, data = Boston)
summary(model_tax)
##
## Call:
## lm(formula = crim ~ tax, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.513 -2.738 -0.194 1.065 77.696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.528369 0.815809 -10.45 <2e-16 ***
## tax 0.029742 0.001847 16.10 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.997 on 504 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.3383
## F-statistic: 259.2 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: 0.029742 (tax)
# PVal: < 2.2e-16
model_ptratio <- lm(crim ~ ptratio, data = Boston)
summary(model_ptratio)
##
## Call:
## lm(formula = crim ~ ptratio, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.654 -3.985 -1.912 1.825 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.6469 3.1473 -5.607 3.40e-08 ***
## ptratio 1.1520 0.1694 6.801 2.94e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.24 on 504 degrees of freedom
## Multiple R-squared: 0.08407, Adjusted R-squared: 0.08225
## F-statistic: 46.26 on 1 and 504 DF, p-value: 2.943e-11
# Coeff: 1.1520 (ptratio)
# PVal: 2.943e-11
model_black <- lm(crim ~ black, data = Boston)
summary(model_black)
##
## Call:
## lm(formula = crim ~ black, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.756 -2.299 -2.095 -1.296 86.822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.553529 1.425903 11.609 <2e-16 ***
## black -0.036280 0.003873 -9.367 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.946 on 504 degrees of freedom
## Multiple R-squared: 0.1483, Adjusted R-squared: 0.1466
## F-statistic: 87.74 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: -0.036280 (black)
# PVal: < 2.2e-16
model_lstat <- lm(crim ~ lstat, data = Boston)
summary(model_lstat)
##
## Call:
## lm(formula = crim ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.925 -2.822 -0.664 1.079 82.862
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.33054 0.69376 -4.801 2.09e-06 ***
## lstat 0.54880 0.04776 11.491 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.664 on 504 degrees of freedom
## Multiple R-squared: 0.2076, Adjusted R-squared: 0.206
## F-statistic: 132 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: 0.54880 (lstat)
# PVal: < 2.2e-16
model_medv <- lm(crim ~ medv, data = Boston)
summary(model_medv)
##
## Call:
## lm(formula = crim ~ medv, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.071 -4.022 -2.343 1.298 80.957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.79654 0.93419 12.63 <2e-16 ***
## medv -0.36316 0.03839 -9.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.934 on 504 degrees of freedom
## Multiple R-squared: 0.1508, Adjusted R-squared: 0.1491
## F-statistic: 89.49 on 1 and 504 DF, p-value: < 2.2e-16
# Coeff: -0.36316 (medv)
# PVal: < 2.2e-16
# Positive Relationship: indus, nox, age, rad, tax, lstat, ptratio
# Negative Relationship: zn, chas, rm, dis, black, medv
# Looking at p-values, only chas is statistically insignificant
# FIX: avoid attach(Boston) - attach() is an R anti-pattern (it can silently
# mask objects in the search path); reference the columns explicitly instead.
plot(Boston$ptratio, Boston$crim, xlab = "ptratio", ylab = "crim")
abline(model_ptratio, lwd = 2, col = "green")
plot(Boston$dis, Boston$crim, xlab = "dis", ylab = "crim")
abline(model_dis, lwd = 2, col = "blue")
# (b)
# Multiple regression of crim on every other predictor at once
multi <- lm(crim ~ ., data = Boston)
summary(multi)
##
## Call:
## lm(formula = crim ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.924 -2.120 -0.353 1.019 75.051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.033228 7.234903 2.354 0.018949 *
## zn 0.044855 0.018734 2.394 0.017025 *
## indus -0.063855 0.083407 -0.766 0.444294
## chas -0.749134 1.180147 -0.635 0.525867
## nox -10.313535 5.275536 -1.955 0.051152 .
## rm 0.430131 0.612830 0.702 0.483089
## age 0.001452 0.017925 0.081 0.935488
## dis -0.987176 0.281817 -3.503 0.000502 ***
## rad 0.588209 0.088049 6.680 6.46e-11 ***
## tax -0.003780 0.005156 -0.733 0.463793
## ptratio -0.271081 0.186450 -1.454 0.146611
## black -0.007538 0.003673 -2.052 0.040702 *
## lstat 0.126211 0.075725 1.667 0.096208 .
## medv -0.198887 0.060516 -3.287 0.001087 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.439 on 492 degrees of freedom
## Multiple R-squared: 0.454, Adjusted R-squared: 0.4396
## F-statistic: 31.47 on 13 and 492 DF, p-value: < 2.2e-16
# Coeffs
# Zn -0.07393 -> 0.04485
# indus 0.50978 -> -0.063855
# chas -1.8928 -> -0.749134
# nox 31.249 -> -10.313535
# rm -2.684 -> 0.430131
# age 0.10779 -> 0.001452
# dis -1.5509 -> -0.987176
# rad 0.61791 -> 0.588209
# tax 0.029742 -> -0.003780
# ptratio 1.1520 -> -0.271081
# black -0.036280 -> -0.007538
# lstat 0.54880 -> 0.126211
# medv -0.36316 -> -0.198887
# We can see that for some of the variables the coefficients have changed not only in magnitude but also in sign
# In simple linear regression all other variables are ignored, while in multiple regression the other predictors are held constant when analysing the impact of the predictor in question
# Thus this change in coefficients and their statistical significance is well explained by collinearity
# Null hypothesis can be rejected for pval < 0.05 -> zn, dis, rad, black, medv
## (c)
# Plot each predictor's univariate slope against its multivariate slope.
# FIX: build the coefficient vector in one pass instead of growing it with
# repeated c() appends (a quadratic-time anti-pattern). unlist() preserves
# the coefficient names, matching the original vector exactly.
simple_models <- list(
  model_zn, model_indus, model_chas, model_nox, model_rm, model_age,
  model_dis, model_rad, model_tax, model_ptratio, model_black,
  model_lstat, model_medv
)
univ <- unlist(lapply(simple_models, function(m) coef(m)[2]))
multiv <- coef(multi)[-1] # drop the intercept
par(mfrow = c(1, 1))
plot(univ, multiv, col = 'blue')
## (d)
# Fit a cubic polynomial in each predictor to look for non-linearity
model_zn_3 <- lm(crim ~ zn + I(zn^2) + I(zn^3), data = Boston)
summary(model_zn_3)
##
## Call:
## lm(formula = crim ~ zn + I(zn^2) + I(zn^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.821 -4.614 -1.294 0.473 84.130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.846e+00 4.330e-01 11.192 < 2e-16 ***
## zn -3.322e-01 1.098e-01 -3.025 0.00261 **
## I(zn^2) 6.483e-03 3.861e-03 1.679 0.09375 .
## I(zn^3) -3.776e-05 3.139e-05 -1.203 0.22954
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.372 on 502 degrees of freedom
## Multiple R-squared: 0.05824, Adjusted R-squared: 0.05261
## F-statistic: 10.35 on 3 and 502 DF, p-value: 1.281e-06
# zn (only the linear term is significant at the 5% level)
model_indus_3 <- lm(crim ~ indus + I(indus^2) + I(indus^3), data = Boston)
summary(model_indus_3)
##
## Call:
## lm(formula = crim ~ indus + I(indus^2) + I(indus^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.278 -2.514 0.054 0.764 79.713
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6625683 1.5739833 2.327 0.0204 *
## indus -1.9652129 0.4819901 -4.077 5.30e-05 ***
## I(indus^2) 0.2519373 0.0393221 6.407 3.42e-10 ***
## I(indus^3) -0.0069760 0.0009567 -7.292 1.20e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.423 on 502 degrees of freedom
## Multiple R-squared: 0.2597, Adjusted R-squared: 0.2552
## F-statistic: 58.69 on 3 and 502 DF, p-value: < 2.2e-16
# indus, indus^2, indus^3 all significant
model_nox_3 <- lm(crim ~ nox + I(nox^2) + I(nox^3), data = Boston)
summary(model_nox_3)
##
## Call:
## lm(formula = crim ~ nox + I(nox^2) + I(nox^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.110 -2.068 -0.255 0.739 78.302
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 233.09 33.64 6.928 1.31e-11 ***
## nox -1279.37 170.40 -7.508 2.76e-13 ***
## I(nox^2) 2248.54 279.90 8.033 6.81e-15 ***
## I(nox^3) -1245.70 149.28 -8.345 6.96e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.234 on 502 degrees of freedom
## Multiple R-squared: 0.297, Adjusted R-squared: 0.2928
## F-statistic: 70.69 on 3 and 502 DF, p-value: < 2.2e-16
# nox, nox^2, nox^3 all significant
model_rm_3 <- lm(crim ~ rm + I(rm^2) + I(rm^3), data = Boston)
summary(model_rm_3)
##
## Call:
## lm(formula = crim ~ rm + I(rm^2) + I(rm^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.485 -3.468 -2.221 -0.015 87.219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.6246 64.5172 1.746 0.0815 .
## rm -39.1501 31.3115 -1.250 0.2118
## I(rm^2) 4.5509 5.0099 0.908 0.3641
## I(rm^3) -0.1745 0.2637 -0.662 0.5086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.33 on 502 degrees of freedom
## Multiple R-squared: 0.06779, Adjusted R-squared: 0.06222
## F-statistic: 12.17 on 3 and 502 DF, p-value: 1.067e-07
# None significant for rm
model_age_3 <- lm(crim ~ age + I(age^2) + I(age^3), data = Boston)
summary(model_age_3)
##
## Call:
## lm(formula = crim ~ age + I(age^2) + I(age^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.762 -2.673 -0.516 0.019 82.842
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.549e+00 2.769e+00 -0.920 0.35780
## age 2.737e-01 1.864e-01 1.468 0.14266
## I(age^2) -7.230e-03 3.637e-03 -1.988 0.04738 *
## I(age^3) 5.745e-05 2.109e-05 2.724 0.00668 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.84 on 502 degrees of freedom
## Multiple R-squared: 0.1742, Adjusted R-squared: 0.1693
## F-statistic: 35.31 on 3 and 502 DF, p-value: < 2.2e-16
# age^2, age^3 significant
model_dis_3 <- lm(crim ~ dis + I(dis^2) + I(dis^3), data = Boston)
summary(model_dis_3)
##
## Call:
## lm(formula = crim ~ dis + I(dis^2) + I(dis^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.757 -2.588 0.031 1.267 76.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.0476 2.4459 12.285 < 2e-16 ***
## dis -15.5543 1.7360 -8.960 < 2e-16 ***
## I(dis^2) 2.4521 0.3464 7.078 4.94e-12 ***
## I(dis^3) -0.1186 0.0204 -5.814 1.09e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 502 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2735
## F-statistic: 64.37 on 3 and 502 DF, p-value: < 2.2e-16
# dis, dis^2, dis^3 all significant
model_rad_3 <- lm(crim ~ rad + I(rad^2) + I(rad^3), data = Boston)
summary(model_rad_3)
##
## Call:
## lm(formula = crim ~ rad + I(rad^2) + I(rad^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.381 -0.412 -0.269 0.179 76.217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.605545 2.050108 -0.295 0.768
## rad 0.512736 1.043597 0.491 0.623
## I(rad^2) -0.075177 0.148543 -0.506 0.613
## I(rad^3) 0.003209 0.004564 0.703 0.482
##
## Residual standard error: 6.682 on 502 degrees of freedom
## Multiple R-squared: 0.4, Adjusted R-squared: 0.3965
## F-statistic: 111.6 on 3 and 502 DF, p-value: < 2.2e-16
# None significant for rad
model_tax_3 <- lm(crim ~ tax + I(tax^2) + I(tax^3), data = Boston)
summary(model_tax_3)
##
## Call:
## lm(formula = crim ~ tax + I(tax^2) + I(tax^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.273 -1.389 0.046 0.536 76.950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.918e+01 1.180e+01 1.626 0.105
## tax -1.533e-01 9.568e-02 -1.602 0.110
## I(tax^2) 3.608e-04 2.425e-04 1.488 0.137
## I(tax^3) -2.204e-07 1.889e-07 -1.167 0.244
##
## Residual standard error: 6.854 on 502 degrees of freedom
## Multiple R-squared: 0.3689, Adjusted R-squared: 0.3651
## F-statistic: 97.8 on 3 and 502 DF, p-value: < 2.2e-16
# None significant for tax
model_ptratio_3 <- lm(crim ~ ptratio + I(ptratio^2) + I(ptratio^3), data = Boston)
summary(model_ptratio_3)
##
## Call:
## lm(formula = crim ~ ptratio + I(ptratio^2) + I(ptratio^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.833 -4.146 -1.655 1.408 82.697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 477.18405 156.79498 3.043 0.00246 **
## ptratio -82.36054 27.64394 -2.979 0.00303 **
## I(ptratio^2) 4.63535 1.60832 2.882 0.00412 **
## I(ptratio^3) -0.08476 0.03090 -2.743 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.122 on 502 degrees of freedom
## Multiple R-squared: 0.1138, Adjusted R-squared: 0.1085
## F-statistic: 21.48 on 3 and 502 DF, p-value: 4.171e-13
# ptratio, ptratio^2, ptratio^3 all significant
model_black_3 <- lm(crim ~ black + I(black^2) + I(black^3), data = Boston)
summary(model_black_3)
##
## Call:
## lm(formula = crim ~ black + I(black^2) + I(black^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.096 -2.343 -2.128 -1.439 86.790
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.826e+01 2.305e+00 7.924 1.5e-14 ***
## black -8.356e-02 5.633e-02 -1.483 0.139
## I(black^2) 2.137e-04 2.984e-04 0.716 0.474
## I(black^3) -2.652e-07 4.364e-07 -0.608 0.544
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.955 on 502 degrees of freedom
## Multiple R-squared: 0.1498, Adjusted R-squared: 0.1448
## F-statistic: 29.49 on 3 and 502 DF, p-value: < 2.2e-16
# None significant for black
model_lstat_3 <- lm(crim ~ lstat + I(lstat^2) + I(lstat^3), data = Boston)
summary(model_lstat_3)
##
## Call:
## lm(formula = crim ~ lstat + I(lstat^2) + I(lstat^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.234 -2.151 -0.486 0.066 83.353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2009656 2.0286452 0.592 0.5541
## lstat -0.4490656 0.4648911 -0.966 0.3345
## I(lstat^2) 0.0557794 0.0301156 1.852 0.0646 .
## I(lstat^3) -0.0008574 0.0005652 -1.517 0.1299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.629 on 502 degrees of freedom
## Multiple R-squared: 0.2179, Adjusted R-squared: 0.2133
## F-statistic: 46.63 on 3 and 502 DF, p-value: < 2.2e-16
# None significant for lstat
model_medv_3 <- lm(crim ~ medv + I(medv^2) + I(medv^3), data = Boston)
summary(model_medv_3)
##
## Call:
## lm(formula = crim ~ medv + I(medv^2) + I(medv^3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.427 -1.976 -0.437 0.439 73.655
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.1655381 3.3563105 15.840 < 2e-16 ***
## medv -5.0948305 0.4338321 -11.744 < 2e-16 ***
## I(medv^2) 0.1554965 0.0171904 9.046 < 2e-16 ***
## I(medv^3) -0.0014901 0.0002038 -7.312 1.05e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.569 on 502 degrees of freedom
## Multiple R-squared: 0.4202, Adjusted R-squared: 0.4167
## F-statistic: 121.3 on 3 and 502 DF, p-value: < 2.2e-16
# medv, medv^2, medv^3
# From all above summary results we can see that following degree 3 coefficients are statistically significant:
# indus, nox, age, dis, ptratio, medv
library(ISLR)
data(College)
# install.packages("glmnet") # DONE
# install.packages("pls") # DONE - the pls package provides both pcr() and plsr()
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
# FIX: load pls with library() rather than require(); require() only warns
# (and returns FALSE) when the package is missing, silently breaking later code.
library(pls)
##
## Attaching package: 'pls'
## The following object is masked from 'package:stats':
##
## loadings
set.seed(1)
x <- model.matrix(Apps ~ ., College)[, -1]
y <- College$Apps
## (a)
# 60/40 train-test split of the College data
train <- sample(1:nrow(College), round(nrow(College) * 0.6))
test <- (-train)
df_train <- College[train, ]
df_test <- College[-train, ]
## (b)
# Ordinary least squares fit on the training set
linear_model <- lm(Apps ~ ., data = df_train)
linear_pred <- predict(linear_model, df_test)
linear_mse <- mean((linear_pred - df_test$Apps)^2) # Output: 1124481.5726
## (c)
# Ridge regression (alpha = 0) with lambda chosen by cross-validation
grid <- 10^seq(10, -2, length = 100)
cv1 <- cv.glmnet(x[train, ], y[train], alpha = 0)
lam <- cv1$lambda.min # 382.88
ridge1 <- glmnet(x[train, ], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
ridge_pred <- predict(ridge1, s = lam, newx = x[test, ])
ridge_err <- mean((ridge_pred - y[test])^2) # Output = 1033013.5726
## (d)
# Lasso (alpha = 1) with lambda chosen by cross-validation
cv2 <- cv.glmnet(x[train, ], y[train], alpha = 1)
lam2 <- cv2$lambda.min
lasso1 <- glmnet(x[train, ], y[train], alpha = 1, lambda = grid)
lasso_pred <- predict(lasso1, s = lam2, newx = x[test, ])
lasso_err <- mean((lasso_pred - y[test])^2) # Output = 1113405.8410
lasso.coef <- predict(lasso1, type = "coefficients", s = lam2)[1:18, ]
length(lasso.coef[lasso.coef != 0]) # Output: 17 non-zero coefficients
## [1] 17
length(lasso.coef[lasso.coef == 0]) # Output: 1 coefficient shrunk to zero
## [1] 1
lasso.coef[lasso.coef == 0] # F.Undergrad is the dropped predictor
## F.Undergrad
## 0
lasso.coef
## (Intercept) PrivateYes Accept Enroll Top10perc
## -665.63058052 -442.72714002 1.70268040 -0.81051715 59.79404848
## Top25perc F.Undergrad P.Undergrad Outstate Room.Board
## -18.44331561 0.00000000 0.06562778 -0.08672713 0.17166996
## Books Personal PhD Terminal S.F.Ratio
## 0.39940807 -0.01122307 -12.29852997 0.93605355 21.66974536
## perc.alumni Expend Grad.Rate
## 1.46131819 0.05675117 6.73310680
## (e) Principal component regression, standardized predictors, CV validation.
# (fixed scale = T -> TRUE: T is a reassignable variable, not a keyword)
pcr1 = pcr(Apps~., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr1, val.type = "MSEP") # Observing from the graph - test for 5, 15, 16
pcr_pred = predict(pcr1, x[test,], ncomp = 5)
mean((pcr_pred - y[test])^2) # 1877569
## [1] 1877569
pcr_pred = predict(pcr1, x[test,], ncomp = 15)
mean((pcr_pred - y[test])^2) # 1238541
## [1] 1238541
pcr_pred = predict(pcr1, x[test,], ncomp = 16)
pcr_err = mean((pcr_pred - y[test])^2) # 1206092 # Selecting for 16
## (f) Partial least squares, same protocol.
pls1 = plsr(Apps~., data = College, subset = train, scale = TRUE, validation = "CV")
validationplot(pls1, val.type = "MSEP") # Observing from the graph - test for 1, 5, 6, 7
pls_pred = predict(pls1, x[test,], ncomp = 1)
mean((pls_pred - y[test])^2) # 2391370
## [1] 2391370
pls_pred = predict(pls1, x[test,], ncomp = 5)
mean((pls_pred - y[test])^2) # 1175328
## [1] 1175328
pls_pred = predict(pls1, x[test,], ncomp = 6)
pls_err = mean((pls_pred - y[test])^2) # 1138191 # Selecting for 6
pls_pred = predict(pls1, x[test,], ncomp = 7)
mean((pls_pred - y[test])^2) # 1150548
## [1] 1150548
## (g) Compare the five test errors.
err = c(linear_mse, ridge_err, lasso_err, pcr_err, pls_err)
barplot(err, xlab = "Model", ylab="Test Error", names = c("linear", "ridge", "lasso", "pcr", "pls"))
# Order of error (highest to lowest)
# PCR, PLS, LINEAR, LASSO, RIDGE
# All models are quite close in performance
# Test R^2 for each model: 1 - MSE / variance of the test response.
# BUG FIX: pls_r2 previously reused `pls_pred` from the last ncomp = 7 fit
# rather than the selected 6-component model; computing every R^2 from the
# stored test MSEs keeps them consistent with the errors reported above.
test_avg = mean(y[test])
tss = mean((test_avg - y[test])^2)
linear_r2 = 1 - linear_mse / tss # 0.9118
ridge_r2 = 1 - ridge_err / tss # 0.9189
lasso_r2 = 1 - lasso_err / tss # 0.9126
pcr_r2 = 1 - pcr_err / tss # 0.9054
pls_r2 = 1 - pls_err / tss # recorded 0.9097 was computed from the ncomp = 7 fit
# R2 indicates predictions are close to correct
## (a) - TRY OUT REG METHODS IN CHAPTER - DISCUSS RESULTS
library(MASS)
data(Boston)
### BEST SUBSET SELECTION ###
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
dim(Boston)
## [1] 506 14
# No missing values in the response.
sum(is.na(Boston$crim))
## [1] 0
library(leaps)
# Default regsubsets() only searches models with up to 8 predictors.
regfit.full = regsubsets(crim~., Boston)
s1 = summary(regfit.full)
names(s1)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
s1$rsq
## [1] 0.3912567 0.4207965 0.4286123 0.4334892 0.4392738 0.4440173 0.4476594
## [8] 0.4504606
# nvmax = 13 searches all subset sizes (crim has 13 candidate predictors).
regfit.full2 = regsubsets(crim~., data = Boston, nvmax = 13)
s2 = summary(regfit.full2)
names(s2)
## [1] "which" "rsq" "rss" "adjr2" "cp" "bic" "outmat" "obj"
s2$rsq
## [1] 0.3912567 0.4207965 0.4286123 0.4334892 0.4392738 0.4440173 0.4476594
## [8] 0.4504606 0.4524408 0.4530572 0.4535605 0.4540031 0.4540104
par(mfrow=c(2,2))
plot(s2$rss ,xlab="Number of Variables ",ylab="RSS",
type="l")
plot(s2$adjr2 ,xlab="Number of Variables ",
ylab="Adjusted RSq",type="l")
# Plot makes it evident to keep all variables
# Validation-set approach: random TRUE/FALSE split of the rows.
set.seed(1)
train = sample(c(TRUE, FALSE), nrow(Boston), rep = TRUE)
test = (!train)
regfit.best = regsubsets(crim~., data = Boston[train,], nvmax = 13)
test.mat = model.matrix(crim~., data = Boston[test,])
# Test MSE for each subset size i, done by hand because regsubsets
# objects have no built-in predict method.
val.errors = rep(NA, 13)
for (i in 1:13){
coefi = coef(regfit.best, id = i)
pred = test.mat[,names(coefi)]%*%coefi
val.errors[i] = mean((Boston$crim[test]-pred)^2)
}
val.errors
## [1] 53.58865 54.37254 52.66064 52.50837 51.53113 51.12540 51.04084 50.69984
## [9] 50.43627 50.52382 50.71664 50.68209 50.65678
which.min(val.errors) # 9
## [1] 9
coef(regfit.best, 9)
## (Intercept) zn indus nox dis rad
## 16.49784501 0.04428501 -0.11356470 -6.80041892 -0.87067024 0.48133294
## ptratio black lstat medv
## -0.17759119 -0.01438142 0.12943566 -0.13215744
# Refit best subsets on the FULL data and report the 9-variable model.
regfit.best = regsubsets(crim~., data = Boston, nvmax = 13)
coef(regfit.best, 9)
## (Intercept) zn indus nox dis
## 19.124636156 0.042788127 -0.099385948 -10.466490364 -1.002597606
## rad ptratio black lstat medv
## 0.539503547 -0.270835584 -0.008003761 0.117805932 -0.180593877
# 10-fold cross-validation over subset sizes: fold labels per row and a
# k x 13 matrix to hold the per-fold test errors.
k = 10
set.seed(1)
folds = sample(1:k, nrow(Boston), replace = TRUE)
cv.errors = matrix(NA, k, 13, dimnames = list(NULL, paste(1:13)))
# Prediction method for regsubsets fits: rebuild the design matrix for
# `newdata` from the formula stored in the fitted object's call, then
# multiply by the coefficients of the best model with `id` predictors.
predict.regsubsets <- function(object, newdata, id, ...) {
  model_formula <- as.formula(object$call[[2]])
  design <- model.matrix(model_formula, newdata)
  beta <- coef(object, id = id)
  # Keep only the columns that appear in the size-`id` model.
  design[, names(beta)] %*% beta
}
# 10-fold CV: for each fold j, fit best subsets on the other folds and
# record the held-out MSE for every subset size i.
for(j in 1:k){
best.fit = regsubsets(crim~., data = Boston[folds!=j,], nvmax = 13)
for (i in 1:13){
pred = predict(best.fit, Boston[folds == j,], id = i)
cv.errors[j, i] = mean((Boston$crim[folds==j]-pred)^2)
}
}
# Average over the folds -> one CV error per subset size.
mean.cv.errors = apply(cv.errors, 2, mean)
mean.cv.errors
## 1 2 3 4 5 6 7 8
## 45.44573 43.87260 43.94979 44.02424 43.96415 43.96199 42.96268 42.66948
## 9 10 11 12 13
## 42.53822 42.73416 42.52367 42.46014 42.50125
# BUG FIX: was par(mfrwo = c(1, 1)) -- a typo R only warned about, so the
# 2x2 layout set earlier by par(mfrow = c(2, 2)) was never reset.
par(mfrow=c(1,1))
plot(mean.cv.errors, type = 'b')
# cv error lowest for model with 9 variables
# NOTE(review): per the printed values the minimum is actually at size 12
# (42.46); size 9 (42.54) is nearly identical and more parsimonious.
mean.cv.errors[9] # Output: 42.53822
## 9
## 42.53822
# Final best-subset fit on the full data; report the 9-variable model.
reg.best = regsubsets(crim~., data = Boston, nvmax = 13)
coef(reg.best, 9)
## (Intercept) zn indus nox dis
## 19.124636156 0.042788127 -0.099385948 -10.466490364 -1.002597606
## rad ptratio black lstat medv
## 0.539503547 -0.270835584 -0.008003761 0.117805932 -0.180593877
### RIDGE REGRESSION ###
# Numeric design matrix (intercept column dropped) and response for glmnet.
x = model.matrix(crim~., Boston)[,-1]
y = Boston$crim
library(glmnet)
# Lambda grid from 10^10 (near-null model) down to 10^-2 (near OLS).
grid = 10^seq(10, -2, length = 100)
ridge.mod = glmnet(x, y, alpha = 0, lambda = grid)
dim(coef(ridge.mod)) # 14 100
## [1] 14 100
# At a large lambda the coefficients are shrunk close to zero.
ridge.mod$lambda[50] # 11497.57
## [1] 11497.57
coef(ridge.mod)[,50]
## (Intercept) zn indus chas nox
## 3.589847e+00 -5.493033e-05 3.793684e-04 -1.412990e-03 2.325693e-02
## rm age dis rad tax
## -1.996087e-03 8.018407e-05 -1.153934e-03 4.604414e-04 2.214944e-05
## ptratio black lstat medv
## 8.570813e-04 -2.702161e-05 4.083826e-04 -2.701952e-04
# Smaller lambda -> larger coefficients.
ridge.mod$lambda[60] # 705.4802
## [1] 705.4802
coef(ridge.mod)[,60]
## (Intercept) zn indus chas nox
## 3.2516685945 -0.0007972592 0.0057051072 -0.0225316252 0.3514124032
## rm age dis rad tax
## -0.0297634708 0.0012055897 -0.0174560167 0.0071446717 0.0003412896
## ptratio black lstat medv
## 0.0130090009 -0.0004177804 0.0062311863 -0.0041201724
# Coefficients at lambda = 50, interpolated from the fitted path.
predict(ridge.mod, s = 50, type = "coefficients")[1:14,]
## (Intercept) zn indus chas nox rm
## 0.838036330 -0.002577783 0.035801043 -0.249700752 2.367151149 -0.166588894
## age dis rad tax ptratio black
## 0.007679277 -0.121899702 0.065407357 0.002877986 0.091353127 -0.003626702
## lstat medv
## 0.047688080 -0.031157551
# 50/50 train/test split by row index.
set.seed (1)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
ridge.mod = glmnet(x[train,], y[train], alpha = 0, lambda = grid, thresh = 1e-12)
ridge.pred = predict(ridge.mod, s = 4, newx = x[test,])
mean((ridge.pred-y.test)^2) # 40.77723
## [1] 40.77723
# Baseline: predict the training mean for every test observation.
mean((mean(y[train])-y.test)^2) # 62.68428
## [1] 62.68428
# Huge lambda behaves like the intercept-only model (same MSE as baseline).
ridge.pred = predict(ridge.mod, s = 1e10, newx = x[test,])
mean((ridge.pred-y.test)^2) # 62.68428
## [1] 62.68428
# lambda = 0 is (numerically) ordinary least squares.
ridge.pred = predict(ridge.mod, s = 0, newx = x[test,])
mean((ridge.pred-y.test)^2) # 41.51969
## [1] 41.51969
### LASSO ###
lasso.mod = glmnet(x[train,], y[train], alpha = 1, lambda = grid)
plot(lasso.mod)
## Warning in regularize.values(x, y, ties, missing(ties), na.rm = na.rm):
## collapsing to unique 'x' values
# Cross-validation to pick the lasso lambda.
set.seed(1)
cv.out = cv.glmnet(x[train,], y[train], alpha = 1)
plot(cv.out)
bestlam = cv.out$lambda.min
lasso.pred = predict(lasso.mod, s = bestlam, newx = x[test,])
mean((lasso.pred-y.test)^2) # 40.99066
## [1] 40.99066
# Refit on all data and inspect sparsity at the chosen lambda
# (only `age` is exactly zero here; `tax` is tiny but nonzero).
out = glmnet(x, y, alpha = 1, lambda = grid)
lasso.coef = predict(out, type = "coefficients", s = bestlam)[1:14,]
lasso.coef
## (Intercept) zn indus chas nox
## 1.269174e+01 3.628905e-02 -7.012417e-02 -5.842484e-01 -6.989138e+00
## rm age dis rad tax
## 2.308650e-01 0.000000e+00 -7.886241e-01 5.146154e-01 -2.235567e-05
## ptratio black lstat medv
## -1.884305e-01 -7.544153e-03 1.248260e-01 -1.582852e-01
### PRINCIPAL COMPONENT REGRESSION ###
library(pls)
set.seed(2)
# PCR on the full data: standardized predictors, 10-fold CV (RMSEP below).
pcr.fit = pcr(crim~., data = Boston, scale = TRUE, validation = "CV")
summary(pcr.fit)
## Data: X dimension: 506 13
## Y dimension: 506 1
## Fit method: svdpc
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 8.61 7.229 7.227 6.814 6.799 6.795 6.794
## adjCV 8.61 7.225 7.222 6.807 6.789 6.788 6.787
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.787 6.654 6.664 6.673 6.676 6.651 6.573
## adjCV 6.780 6.645 6.656 6.664 6.666 6.639 6.562
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 47.70 60.36 69.67 76.45 82.99 88.00 91.14 93.45
## crim 30.69 30.87 39.27 39.61 39.61 39.86 40.14 42.47
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 95.40 97.04 98.46 99.52 100.0
## crim 42.55 42.78 43.04 44.13 45.4
validationplot(pcr.fit, val.type = "MSEP")
# Refit PCR on the training subset only, then evaluate on the test rows.
set.seed(1)
pcr.fit = pcr(crim~., data = Boston, subset = train, scale = TRUE, validation = "CV")
validationplot(pcr.fit, val.type = "MSEP")
# how to select ncomp from graph? It's decreasing throughout
# 8
pcr.pred = predict(pcr.fit, x[test,], ncomp = 8)
mean((pcr.pred-y.test)^2) # 43.21097
## [1] 43.21097
# ncomp = 13 uses every component, i.e. no dimension reduction at all.
pcr.pred = predict(pcr.fit, x[test,], ncomp = 13)
mean((pcr.pred-y.test)^2) # 41.54639
## [1] 41.54639
pcr.pred = predict(pcr.fit, x[test,], ncomp = 4)
mean((pcr.pred-y.test)^2) # 43.91107
## [1] 43.91107
### PARTIAL LEAST SQUARES ###
# PLS on the training subset: standardized predictors, 10-fold CV.
set.seed(1)
pls.fit = plsr(crim~., data = Boston, subset = train, scale = TRUE, validation = "CV")
summary(pls.fit)
## Data: X dimension: 253 13
## Y dimension: 253 1
## Fit method: kernelpls
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.275 7.328 6.842 6.784 6.741 6.718 6.695
## adjCV 9.275 7.322 6.834 6.768 6.728 6.707 6.683
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.722 6.707 6.696 6.701 6.700 6.700 6.700
## adjCV 6.706 6.693 6.683 6.688 6.686 6.686 6.686
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 48.10 57.46 62.68 69.65 77.95 80.82 84.40 87.65
## crim 38.94 47.55 49.73 50.61 50.85 51.17 51.29 51.35
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 90.47 93.07 96.66 98.26 100.00
## crim 51.37 51.37 51.37 51.37 51.37
validationplot(pls.fit, val.type = "MSEP")
# how to select ncomp
# Most of the CV improvement happens by 2 components (see recorded summary),
# so evaluate the 2-component model on the test rows.
pls.pred = predict(pls.fit, x[test,], ncomp = 2)
mean((pls.pred-y.test)^2) # 42.74086
## [1] 42.74086
# Final PLS model on the FULL data with the chosen 2 components.
# BUG FIX: this was assigned to `pls.fot` (typo), so the summary() below
# silently re-printed the earlier training-subset fit instead of this model.
pls.fit = plsr(crim~., data = Boston, scale = TRUE, ncomp = 2)
summary(pls.fit)
# NOTE(review): the recorded output below predates the typo fix -- it still
# shows the 253-row, 13-component training fit; re-run to refresh it.
## Data: X dimension: 253 13
## Y dimension: 253 1
## Fit method: kernelpls
## Number of components considered: 13
##
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
## (Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps
## CV 9.275 7.328 6.842 6.784 6.741 6.718 6.695
## adjCV 9.275 7.322 6.834 6.768 6.728 6.707 6.683
## 7 comps 8 comps 9 comps 10 comps 11 comps 12 comps 13 comps
## CV 6.722 6.707 6.696 6.701 6.700 6.700 6.700
## adjCV 6.706 6.693 6.683 6.688 6.686 6.686 6.686
##
## TRAINING: % variance explained
## 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps
## X 48.10 57.46 62.68 69.65 77.95 80.82 84.40 87.65
## crim 38.94 47.55 49.73 50.61 50.85 51.17 51.29 51.35
## 9 comps 10 comps 11 comps 12 comps 13 comps
## X 90.47 93.07 96.66 98.26 100.00
## crim 51.37 51.37 51.37 51.37 51.37
## (b) - Propose models that perform well, evaluate using cross validation
## (c) - does chosen model involve all features - why or why not
# I would choose the Lasso model, as it gives the lowest test mse.
# Lasso models are generally more interpretable.
# It results in a sparse model with 10 variables.
# Two variables whose effect on the response were below the required threshold were removed.
library(ISLR)
data(Carseats)
dim(Carseats)
## [1] 400 11
## (a) Random TRUE/FALSE train/test split of the 400 rows.
set.seed(1)
train = sample(c(TRUE, FALSE), nrow(Carseats), rep = TRUE)
test = (!train)
## (b) Regression tree for Sales on the training rows.
par(mfrow = c(1, 1))
library(tree)
tree_model = tree(Sales~., Carseats[train,])
summary(tree_model)
##
## Regression tree:
## tree(formula = Sales ~ ., data = Carseats[train, ])
## Variables actually used in tree construction:
## [1] "ShelveLoc" "Price" "Advertising" "Income" "CompPrice"
## [6] "Age"
## Number of terminal nodes: 16
## Residual mean deviance: 2.394 = 442.9 / 185
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.74400 -0.89700 -0.05937 0.00000 0.93060 6.66900
plot(tree_model)
text(tree_model)
tree_pred = predict(tree_model, Carseats[test,])
test_mse = mean((tree_pred - Carseats[test,]$Sales)^2) # 5.207756
# Points to note:
# Variables actually used in the tree construction: ShelveLoc, Price, Advertising, Income, CompPrice, Age
# No. of Terminal Nodes: 16
# Test MSE = 5.2
## (c) Cost-complexity pruning with cross-validation.
set.seed(2)
cv_car = cv.tree(tree_model)
summary(cv_car)
## Length Class Mode
## size 15 -none- numeric
## dev 15 -none- numeric
## k 15 -none- numeric
## method 1 -none- character
plot(cv_car$size, cv_car$dev, xlab = "Terminal Nodes", ylab = "CV Error", type = "b")
# Optimal value = 9 Terminal Nodes
prune_car = prune.tree(tree_model, best = 9)
tree_pred2 = predict(prune_car, Carseats[test,])
test_mse2 = mean((tree_pred2 - Carseats[test,]$Sales)^2) # 4.98545
# Pruning the tree slightly improved the MSE: 5.2 -> 4.98
## (d) Bagging = random forest with mtry = all 10 predictors.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
set.seed(2)
# (fixed importance = T -> TRUE: T is a reassignable variable, not a keyword)
bag_car = randomForest(Sales~., data = Carseats[train,], mtry = 10, importance = TRUE)
importance(bag_car)
## %IncMSE IncNodePurity
## CompPrice 19.6211795 131.699726
## Income 7.5973882 97.853410
## Advertising 17.4351356 160.290717
## Population 2.4855317 71.032635
## Price 49.0567693 441.543542
## ShelveLoc 52.5432634 450.338179
## Age 13.5076447 133.766357
## Education 0.8161629 47.635165
## Urban 0.9596331 12.990954
## US 0.2780947 7.638444
bag_pred = predict(bag_car, Carseats[test,])
test_mse_bag = mean((bag_pred - Carseats[test,]$Sales)^2)
# (removed a stale note claiming randomForest was unavailable -- the package
# loaded and ran successfully above)
## (e) Random forests with smaller mtry (p/2, sqrt(p), p/4 of the 10 predictors).
# (fixed importance = T -> TRUE throughout: T is reassignable, not a keyword)
set.seed(2)
car1 = randomForest(Sales~.,data=Carseats[train,],mtry=10/2,importance=TRUE)
importance(car1)
## %IncMSE IncNodePurity
## CompPrice 15.7914168 135.98350
## Income 4.0950976 117.45341
## Advertising 18.8770165 180.54746
## Population 3.8245971 92.60216
## Price 36.0921926 367.09517
## ShelveLoc 44.6241144 405.67345
## Age 12.2477216 152.44846
## Education -1.6897207 55.36410
## Urban -0.3899192 13.77597
## US 2.9518607 12.96404
# NOTE(review): sqrt(10) and 10/4 are non-integers; randomForest coerces them
# internally -- confirm the effective mtry if it matters.
car2 = randomForest(Sales~.,data=Carseats[train,],mtry=sqrt(10),importance=TRUE)
importance(car2)
## %IncMSE IncNodePurity
## CompPrice 8.5310606 141.37353
## Income 4.4891172 132.78327
## Advertising 16.0144551 181.80453
## Population 1.1012824 119.31739
## Price 31.0901404 317.19937
## ShelveLoc 35.0344742 343.81355
## Age 10.7246390 159.10963
## Education -1.6358709 66.17645
## Urban -0.8244829 19.05945
## US 1.2331181 16.22396
car3 = randomForest(Sales~.,data=Carseats[train,],mtry=10/4,importance=TRUE)
importance(car3)
## %IncMSE IncNodePurity
## CompPrice 7.6759698 134.76573
## Income 2.7391782 145.55262
## Advertising 15.0496256 178.94052
## Population -0.8309692 130.85918
## Price 24.1485531 276.78857
## ShelveLoc 30.8297555 306.68708
## Age 10.9598221 161.34502
## Education -1.8549693 76.27444
## Urban -0.1916890 22.35433
## US 4.1864982 24.86787
varImpPlot(car2)
library(ISLR)
# (attach(Caravan) removed: all later code uses explicit Caravan$/train_set$/
# test_set$ references, and attach() risks masking workspace variables.)
?Caravan # Insurance Company Benchmark - 5822 records - 86 vars
plot(Caravan$Purchase)
## (a) Numeric 0/1 response (the bernoulli gbm below needs 0/1, not a factor).
# Vectorized comparison replaces the original 5822-iteration scalar loop;
# produces identical values (1 for "Yes", 0 otherwise).
Caravan$PurchaseYES = as.numeric(Caravan$Purchase == "Yes")
train_set = Caravan[1:1000,]   # first 1000 records for training
test_set = Caravan[1001:5822,] # remaining 4822 records for testing
## (b) Boosting: 1000 trees, shrinkage 0.01, bernoulli loss on the 0/1 response.
library(gbm)
## Loaded gbm 2.1.8
set.seed(10)
# `- Purchase` drops the factor response so only PurchaseYES drives the fit.
boost_model = gbm(PurchaseYES~.-Purchase, data = train_set,
distribution = "bernoulli", n.trees = 1000, shrinkage = 0.01)
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 50: PVRAAUT has no variation.
## Warning in gbm.fit(x = x, y = y, offset = offset, distribution = distribution, :
## variable 71: AVRAAUT has no variation.
summary(boost_model)
## var rel.inf
## PPERSAUT PPERSAUT 15.38257285
## MKOOPKLA MKOOPKLA 10.46605387
## MOPLHOOG MOPLHOOG 7.55417441
## PBRAND PBRAND 4.91302326
## MBERMIDD MBERMIDD 4.75162262
## ABRAND ABRAND 4.10695497
## MGODGE MGODGE 4.08460852
## MINK3045 MINK3045 3.57754339
## MOSTYPE MOSTYPE 2.95142262
## MSKA MSKA 2.31330767
## MSKC MSKC 2.26066183
## MBERARBG MBERARBG 2.23298183
## PBYSTAND PBYSTAND 2.12038372
## PWAPART PWAPART 2.11822298
## MAUT2 MAUT2 2.09432337
## MSKB1 MSKB1 2.04625792
## MAUT1 MAUT1 2.01682392
## MGODOV MGODOV 1.81440468
## MGODPR MGODPR 1.76389034
## MBERHOOG MBERHOOG 1.62253130
## MRELGE MRELGE 1.61409689
## MINKGEM MINKGEM 1.54973694
## MRELOV MRELOV 1.16048896
## MHHUUR MHHUUR 1.04601208
## MGODRK MGODRK 1.00271859
## MFWEKIND MFWEKIND 0.98670997
## MAUT0 MAUT0 0.95127063
## MGEMLEEF MGEMLEEF 0.92105017
## MINKM30 MINKM30 0.85528510
## APERSAUT APERSAUT 0.83742532
## MINK7512 MINK7512 0.81474065
## MOPLMIDD MOPLMIDD 0.76091591
## MGEMOMV MGEMOMV 0.72333895
## MFGEKIND MFGEKIND 0.72106858
## MOSHOOFD MOSHOOFD 0.69538457
## PLEVEN PLEVEN 0.66896464
## MINK123M MINK123M 0.58561784
## PMOTSCO PMOTSCO 0.53489044
## MBERBOER MBERBOER 0.45829619
## MZFONDS MZFONDS 0.43597782
## MSKD MSKD 0.42079495
## MINK4575 MINK4575 0.37339174
## MSKB2 MSKB2 0.35869222
## MZPART MZPART 0.34433413
## MHKOOP MHKOOP 0.29551164
## MBERARBO MBERARBO 0.22410217
## MBERZELF MBERZELF 0.18600145
## MRELSA MRELSA 0.18042416
## MAANTHUI MAANTHUI 0.05935146
## ALEVEN ALEVEN 0.04163979
## MFALLEEN MFALLEEN 0.00000000
## MOPLLAAG MOPLLAAG 0.00000000
## PWABEDR PWABEDR 0.00000000
## PWALAND PWALAND 0.00000000
## PBESAUT PBESAUT 0.00000000
## PVRAAUT PVRAAUT 0.00000000
## PAANHANG PAANHANG 0.00000000
## PTRACTOR PTRACTOR 0.00000000
## PWERKT PWERKT 0.00000000
## PBROM PBROM 0.00000000
## PPERSONG PPERSONG 0.00000000
## PGEZONG PGEZONG 0.00000000
## PWAOREG PWAOREG 0.00000000
## PZEILPL PZEILPL 0.00000000
## PPLEZIER PPLEZIER 0.00000000
## PFIETS PFIETS 0.00000000
## PINBOED PINBOED 0.00000000
## AWAPART AWAPART 0.00000000
## AWABEDR AWABEDR 0.00000000
## AWALAND AWALAND 0.00000000
## ABESAUT ABESAUT 0.00000000
## AMOTSCO AMOTSCO 0.00000000
## AVRAAUT AVRAAUT 0.00000000
## AAANHANG AAANHANG 0.00000000
## ATRACTOR ATRACTOR 0.00000000
## AWERKT AWERKT 0.00000000
## ABROM ABROM 0.00000000
## APERSONG APERSONG 0.00000000
## AGEZONG AGEZONG 0.00000000
## AWAOREG AWAOREG 0.00000000
## AZEILPL AZEILPL 0.00000000
## APLEZIER APLEZIER 0.00000000
## AFIETS AFIETS 0.00000000
## AINBOED AINBOED 0.00000000
## ABYSTAND ABYSTAND 0.00000000
# Most important predictors: PPERSAUT, MKOOPKLA, MOPLHOOG
## (c) Predicted purchase probabilities on the test set, thresholded at 20%.
pro = predict(boost_model, newdata = test_set, n.trees = 1000, type = "response")
dim(test_set) # 4822 87
## [1] 4822 87
# BUG FIX: was rep("No, 4822") -- the comma was inside the string, producing a
# length-1 vector "No, 4822"; the logical assignment below then grew the
# vector with NAs, which table() silently dropped (see recorded table).
pred = rep("No", 4822)
pred[pro>0.20] = "Yes"
act = test_set$Purchase
# Confusion matrix of actual vs predicted purchase.
table(act, pred)
# NOTE(review): the recorded output below is from the buggy run (note the
# "No, 4822" column and only 140 counted observations); re-run after the fix.
## pred
## act No, 4822 Yes
## No 1 108
## Yes 0 31